AAQoL machine learning analysis with unbalanced random forest

Author

Miguel Fudolig

library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(glmnet)
library(randomForestSRC)

Data set

This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.

Input data set

# Load the survey export and prepare it for modelling:
#   * every character column becomes a factor;
#   * reference levels are pinned for the two English-proficiency items and
#     for Ethnicity;
#   * Income_median collapses the income brackets into "Below" (under $60,000)
#     and "Above" ($60,000 and over). Any other Income value falls through
#     `.default` and becomes NA once the two-level factor is built.
qol <- read_csv("AAQoL.csv") |>
  mutate(across(where(is.character), as.factor)) |>
  mutate(
    `English Difficulties` = relevel(`English Difficulties`, ref = "Not at all"),
    `English Speaking` = relevel(`English Speaking`, ref = "Not at all"),
    Ethnicity = relevel(Ethnicity, ref = "Chinese")
  ) |>
  mutate(
    Income_median = case_match(
      Income,
      c(
        "$0 - $9,999",
        "$10,000 - $19,999",
        "$20,000 - $29,999",
        "$30,000 - $39,999",
        "$40,000 - $49,999",
        "$50,000 - $59,999"
      ) ~ "Below",
      c(
        "$60,000 - $69,999",
        "$70,000 and over"
      ) ~ "Above",
      .default = Income
    ),
    Income_median = factor(Income_median, levels = c("Below", "Above"))
  )
New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
# Show the full prepared data set as an interactive HTML table.
DT::datatable(qol)
Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html

Source of Information: Family

# Tabulate the levels of `Family` with the helper ps(), which is defined
# outside this section (presumably count and percentage per level, judging by
# the tibble printed below -- TODO confirm against the helper's definition).
ps(Family)
# A tibble: 4 × 3
  Family     n     pct
  <fct>  <int>   <dbl>
1 3          1  0.0383
2 No      1258 48.2   
3 Yes     1331 51.0   
4 <NA>      19  0.728 
# Modelling frame for the "Family" outcome.
# Keeping only "No"/"Yes" answers also removes missing responses and the stray
# "3" level, so no separate is.na() filter is needed. droplevels() then leaves
# a clean two-level factor for classification.
rfdata <- qol |>
  filter(Family %in% c("No", "Yes")) |>
  mutate(Family = droplevels(Family)) |>
  select(
    Family, Ethnicity, Age, Gender, Religion, `Full Time Employment`,
    Income_median, `English Speaking`, `English Difficulties`,
    `See Family`:`Community Trust`
  ) |>
  # na.omit() |>   # left disabled as in the original: rows with missing
  #                # predictors are passed on to imbalanced() unchanged
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Earlier alternative kept for reference:
# rfobj <- rfsrc(Family ~ ., data = rfdata, importance = "permute",
#                perf.type = "gmean", block.size = 10)

# Forest for (possibly) imbalanced two-class data on the full modelling frame;
# importance is requested and G-mean is the reported performance measure.
rfobj <- imbalanced(
  Family ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(rfobj)
                         Sample size: 2187
           Frequency of class labels: 1069, 1118
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 481.122
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1382
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0458
                   (OOB) Brier score: 0.23047767
        (OOB) Normalized Brier score: 0.92191067
                           (OOB) AUC: 0.65538154
                        (OOB) PR-AUC: 0.61755928
                        (OOB) G-mean: 0.61386047
   (OOB) Requested performance error: 0.38613953

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  707 362      0.3386
       Yes 481 637      0.4302

      (OOB) Misclassification rate: 0.3854595
# Print the summary of the forest currently held in `rfobj`.
rfobj |> print()
                         Sample size: 2187
           Frequency of class labels: 1069, 1118
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 481.122
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1382
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0458
                   (OOB) Brier score: 0.23047767
        (OOB) Normalized Brier score: 0.92191067
                           (OOB) AUC: 0.65538154
                        (OOB) PR-AUC: 0.61755928
                        (OOB) G-mean: 0.61386047
   (OOB) Requested performance error: 0.38613953

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  707 362      0.3386
       Yes 481 637      0.4302

      (OOB) Misclassification rate: 0.3854595
# Default diagnostic plots for the forest, one plot per page.
rfobj |> plot(plots.one.page = FALSE)


                              all   No   Yes
Age                        0.0322   NA    NA
Ethnicity                  0.0299   NA    NA
EnglishSpeak               0.0113   NA    NA
Gender                     0.0107   NA    NA
Religion                   0.0092   NA    NA
EnglishDiff                0.0090   NA    NA
Get Along                  0.0081   NA    NA
Similar Values             0.0071   NA    NA
Helpful Family             0.0071   NA    NA
Religious Importance       0.0070   NA    NA
Employment                 0.0063   NA    NA
Close Family               0.0062   NA    NA
Spend Time Together        0.0061   NA    NA
Community Trust            0.0056   NA    NA
Community Shares Values    0.0046   NA    NA
Helpful Community          0.0043   NA    NA
Family Pride               0.0042   NA    NA
Feel Close                 0.0039   NA    NA
Close-knit Community       0.0037   NA    NA
Trust                      0.0030   NA    NA
Expression                 0.0029   NA    NA
Loyalty                    0.0029   NA    NA
See Friends                0.0027   NA    NA
Successful Family          0.0020   NA    NA
Income_median              0.0001   NA    NA
Close Friends             -0.0004   NA    NA
# Raw variable-importance matrix (columns: all, No, Yes) in predictor order.
rfobj[["importance"]]
                                  all No Yes
Ethnicity                0.0298736171 NA  NA
Age                      0.0321993406 NA  NA
Gender                   0.0107122858 NA  NA
Religion                 0.0091633231 NA  NA
Employment               0.0062734484 NA  NA
Income_median            0.0001492718 NA  NA
EnglishSpeak             0.0112882719 NA  NA
EnglishDiff              0.0089896934 NA  NA
See Family              -0.0009159657 NA  NA
Close Family             0.0062397098 NA  NA
Helpful Family           0.0071037309 NA  NA
See Friends              0.0027465503 NA  NA
Close Friends           -0.0004087780 NA  NA
Helpful Friends         -0.0014438134 NA  NA
Family Respect          -0.0013574569 NA  NA
Similar Values           0.0071457920 NA  NA
Successful Family        0.0020479867 NA  NA
Trust                    0.0030046939 NA  NA
Loyalty                  0.0028684188 NA  NA
Family Pride             0.0041954838 NA  NA
Expression               0.0029314166 NA  NA
Spend Time Together      0.0061082163 NA  NA
Feel Close               0.0038668761 NA  NA
Togetherness            -0.0024044764 NA  NA
Religious Attendance    -0.0012261991 NA  NA
Religious Importance     0.0069630886 NA  NA
Close-knit Community     0.0037125808 NA  NA
Helpful Community        0.0042750898 NA  NA
Community Shares Values  0.0046403124 NA  NA
Get Along                0.0081499187 NA  NA
Community Trust          0.0056464934 NA  NA
# Horizontal bar chart of the overall ("all" column) variable importance,
# with predictors ordered by their importance value.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

importance_plot <- ggplot(
  var_importance_df,
  aes(x = reorder(variable, importance), y = importance)
) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()

plot(importance_plot)

Training/Test set Variable Importance

Training Importance

# Hold-out split stratified by outcome: within each class every row is
# independently labelled 0 (train, probability 0.7) or 1 (test, probability
# 0.3), so the class sizes in train/test are approximately, not exactly, 70/30.
pos <- rfdata |> filter(Family == "Yes")
neg <- rfdata |> filter(Family == "No")

set.seed(222)
ind_pos <- sample(c(0, 1), nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(c(0, 1), nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 0, ], neg[ind_neg == 0, ])
test <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])

# Earlier alternative kept for reference:
# rfobj <- rfsrc(Family ~ ., data = rfdata, importance = "permute",
#                perf.type = "gmean", block.size = 10)

# Refit the forest on the training rows only (overwrites the full-data fit).
rfobj <- imbalanced(
  Family ~ .,
  data = train,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(rfobj)
                         Sample size: 1518
           Frequency of class labels: 737, 781
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 338.252
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 959
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0597
                   (OOB) Brier score: 0.23811998
        (OOB) Normalized Brier score: 0.95247992
                           (OOB) AUC: 0.61983862
                        (OOB) PR-AUC: 0.57893367
                        (OOB) G-mean: 0.58067382
   (OOB) Requested performance error: 0.41932618

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  461 276      0.3745
       Yes 360 421      0.4609

      (OOB) Misclassification rate: 0.4189723
# Print the summary of the training-set forest held in `rfobj`.
rfobj |> print()
                         Sample size: 1518
           Frequency of class labels: 737, 781
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 338.252
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 959
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0597
                   (OOB) Brier score: 0.23811998
        (OOB) Normalized Brier score: 0.95247992
                           (OOB) AUC: 0.61983862
                        (OOB) PR-AUC: 0.57893367
                        (OOB) G-mean: 0.58067382
   (OOB) Requested performance error: 0.41932618

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  461 276      0.3745
       Yes 360 421      0.4609

      (OOB) Misclassification rate: 0.4189723
# Default diagnostic plots for the training-set forest, one plot per page.
rfobj |> plot(plots.one.page = FALSE)


                              all   No   Yes
Age                        0.0470   NA    NA
Ethnicity                  0.0102   NA    NA
Helpful Family             0.0069   NA    NA
Community Shares Values    0.0053   NA    NA
Religious Importance       0.0046   NA    NA
Close-knit Community       0.0034   NA    NA
Get Along                  0.0024   NA    NA
EnglishSpeak               0.0019   NA    NA
Togetherness               0.0017   NA    NA
Helpful Friends            0.0011   NA    NA
Spend Time Together        0.0009   NA    NA
Helpful Community          0.0004   NA    NA
Close Family               0.0002   NA    NA
Loyalty                   -0.0004   NA    NA
Trust                     -0.0009   NA    NA
Family Respect            -0.0011   NA    NA
EnglishDiff               -0.0015   NA    NA
Religion                  -0.0015   NA    NA
Family Pride              -0.0019   NA    NA
Gender                    -0.0029   NA    NA
Similar Values            -0.0029   NA    NA
Expression                -0.0032   NA    NA
Feel Close                -0.0035   NA    NA
Religious Attendance      -0.0035   NA    NA
Community Trust           -0.0038   NA    NA
Income_median             -0.0045   NA    NA
# Raw variable-importance matrix of the training-set forest.
rfobj[["importance"]]
                                  all No Yes
Ethnicity                0.0102038208 NA  NA
Age                      0.0470496266 NA  NA
Gender                  -0.0028532605 NA  NA
Religion                -0.0015283531 NA  NA
Employment              -0.0080802923 NA  NA
Income_median           -0.0045255872 NA  NA
EnglishSpeak             0.0018924781 NA  NA
EnglishDiff             -0.0015283531 NA  NA
See Family              -0.0047036867 NA  NA
Close Family             0.0001690678 NA  NA
Helpful Family           0.0068515155 NA  NA
See Friends             -0.0075843402 NA  NA
Close Friends           -0.0093477945 NA  NA
Helpful Friends          0.0011125622 NA  NA
Family Respect          -0.0010596669 NA  NA
Similar Values          -0.0029261995 NA  NA
Successful Family       -0.0075090251 NA  NA
Trust                   -0.0008505768 NA  NA
Loyalty                 -0.0003604139 NA  NA
Family Pride            -0.0018863303 NA  NA
Expression              -0.0032059623 NA  NA
Spend Time Together      0.0009357273 NA  NA
Feel Close              -0.0034885651 NA  NA
Togetherness             0.0017334100 NA  NA
Religious Attendance    -0.0035480431 NA  NA
Religious Importance     0.0046485954 NA  NA
Close-knit Community     0.0033922685 NA  NA
Helpful Community        0.0004130311 NA  NA
Community Shares Values  0.0053396877 NA  NA
Get Along                0.0024330361 NA  NA
Community Trust         -0.0037665724 NA  NA
# Horizontal bar chart of the overall ("all" column) variable importance for
# the training-set forest, with predictors ordered by importance.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

importance_plot <- ggplot(
  var_importance_df,
  aes(x = reorder(variable, importance), y = importance)
) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()

plot(importance_plot)

Test Set Performance

# Score the held-out rows with the forest grown on `train`, then report the
# imbalanced-classification metrics for those test-set predictions.
# The original passed `rfobj` to get.imbalanced.performance(), which merely
# repeated the training (OOB) metrics and left `test_rf` unused. The numbers
# printed below in this rendered report predate the fix and must be
# regenerated.
test_rf <- predict(rfobj, newdata = test)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
781.0000000 737.0000000   1.0597015   0.4855072   0.6255088   0.5390525 
       prec         npv    misclass       brier  brier.norm         auc 
  0.5615104   0.6040172   0.4189723   0.2381200   0.9524799   0.6198386 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.5917843   0.5805264   0.4855072   0.5789337   0.5862291   0.5806001 
      gmean 
  0.5806738 

Source of Information: Health Professionals

# Tabulate the levels of `Heal Professionals` with the helper ps(), which is
# defined outside this section (presumably count and percentage per level,
# judging by the tibble printed below -- TODO confirm).
ps(`Heal Professionals`)
# A tibble: 3 × 3
  `Heal Professionals`     n    pct
  <fct>                <int>  <dbl>
1 No                    1326 50.8  
2 Yes                   1264 48.4  
3 <NA>                    19  0.728
# Modelling frame for the "Heal Professionals" outcome: outcome plus the same
# predictor set as before, complete cases only, with shortened column names.
rfdata <- qol |>
  select(
    `Heal Professionals`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Forest on the full modelling frame; importance is requested and G-mean is
# the reported performance measure.
rfobj <- imbalanced(
  `Heal Professionals` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)

print(rfobj)
                         Sample size: 2188
           Frequency of class labels: 1067, 1121
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 481.7493
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1383
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0506
                   (OOB) Brier score: 0.23121035
        (OOB) Normalized Brier score: 0.92484139
                           (OOB) AUC: 0.65856211
                        (OOB) PR-AUC: 0.62968925
                        (OOB) G-mean: 0.61617778
   (OOB) Requested performance error: 0.38382222

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  662 405      0.3796
       Yes 435 686      0.3880

      (OOB) Misclassification rate: 0.3839122
# Default diagnostic plots for the forest, one plot per page.
rfobj |> plot(plots.one.page = FALSE)


                              all   No   Yes
EnglishSpeak               0.0297   NA    NA
Income_median              0.0077   NA    NA
See Friends                0.0077   NA    NA
Close Friends              0.0054   NA    NA
See Family                 0.0050   NA    NA
Community Trust            0.0028   NA    NA
Community Shares Values    0.0027   NA    NA
Close-knit Community       0.0027   NA    NA
Expression                 0.0023   NA    NA
Close Family               0.0013   NA    NA
Helpful Friends            0.0013   NA    NA
Age                        0.0009   NA    NA
Family Respect             0.0009   NA    NA
Get Along                  0.0005   NA    NA
Employment                 0.0004   NA    NA
Togetherness               0.0002   NA    NA
Similar Values             0.0000   NA    NA
Trust                     -0.0004   NA    NA
Helpful Community         -0.0018   NA    NA
Helpful Family            -0.0023   NA    NA
Gender                    -0.0023   NA    NA
Successful Family         -0.0031   NA    NA
Loyalty                   -0.0032   NA    NA
Religious Importance      -0.0033   NA    NA
Family Pride              -0.0036   NA    NA
Religious Attendance      -0.0038   NA    NA
# Raw variable-importance matrix (columns: all, No, Yes) in predictor order.
rfobj[["importance"]]
                                  all No Yes
Ethnicity               -6.229580e-03 NA  NA
Age                      8.846067e-04 NA  NA
Gender                  -2.334744e-03 NA  NA
Religion                -4.843823e-03 NA  NA
Employment               4.085398e-04 NA  NA
Income_median            7.695117e-03 NA  NA
EnglishSpeak             2.969744e-02 NA  NA
EnglishDiff             -5.583815e-03 NA  NA
See Family               5.038137e-03 NA  NA
Close Family             1.335206e-03 NA  NA
Helpful Family          -2.277969e-03 NA  NA
See Friends              7.678630e-03 NA  NA
Close Friends            5.405558e-03 NA  NA
Helpful Friends          1.278098e-03 NA  NA
Family Respect           8.601492e-04 NA  NA
Similar Values           1.696055e-05 NA  NA
Successful Family       -3.091235e-03 NA  NA
Trust                   -4.489456e-04 NA  NA
Loyalty                 -3.208678e-03 NA  NA
Family Pride            -3.582459e-03 NA  NA
Expression               2.278251e-03 NA  NA
Spend Time Together     -5.470183e-03 NA  NA
Feel Close              -4.070599e-03 NA  NA
Togetherness             1.736981e-04 NA  NA
Religious Attendance    -3.752408e-03 NA  NA
Religious Importance    -3.296409e-03 NA  NA
Close-knit Community     2.713520e-03 NA  NA
Helpful Community       -1.793826e-03 NA  NA
Community Shares Values  2.743503e-03 NA  NA
Get Along                4.832193e-04 NA  NA
Community Trust          2.760539e-03 NA  NA
# Horizontal bar chart of the overall ("all" column) variable importance,
# with predictors ordered by their importance value.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

importance_plot <- ggplot(
  var_importance_df,
  aes(x = reorder(variable, importance), y = importance)
) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()

plot(importance_plot)

Training/Test set Variable Importance

Training Importance

# Hold-out split stratified by outcome: within each class every row is
# independently labelled 0 (train, probability 0.7) or 1 (test, probability
# 0.3), so the class sizes in train/test are approximately, not exactly, 70/30.
pos <- rfdata |> filter(`Heal Professionals` == "Yes")
neg <- rfdata |> filter(`Heal Professionals` == "No")

set.seed(222)
ind_pos <- sample(c(0, 1), nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(c(0, 1), nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 0, ], neg[ind_neg == 0, ])
test <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])

# Refit the forest on the training rows only (overwrites the full-data fit).
rfobj <- imbalanced(
  `Heal Professionals` ~ .,
  data = train,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(rfobj)
                         Sample size: 1523
           Frequency of class labels: 735, 788
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 337.315
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 963
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0721
                   (OOB) Brier score: 0.23687942
        (OOB) Normalized Brier score: 0.94751767
                           (OOB) AUC: 0.63561932
                        (OOB) PR-AUC: 0.60730436
                        (OOB) G-mean: 0.60037858
   (OOB) Requested performance error: 0.39962142

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  448 287      0.3905
       Yes 322 466      0.4086

      (OOB) Misclassification rate: 0.3998687
# Print the summary of the training-set forest held in `rfobj`.
rfobj |> print()
                         Sample size: 1523
           Frequency of class labels: 735, 788
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 337.315
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 963
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.0721
                   (OOB) Brier score: 0.23687942
        (OOB) Normalized Brier score: 0.94751767
                           (OOB) AUC: 0.63561932
                        (OOB) PR-AUC: 0.60730436
                        (OOB) G-mean: 0.60037858
   (OOB) Requested performance error: 0.39962142

Confusion matrix:

          predicted
  observed  No Yes class.error
       No  448 287      0.3905
       Yes 322 466      0.4086

      (OOB) Misclassification rate: 0.3998687
# Default diagnostic plots for the training-set forest, one plot per page.
rfobj |> plot(plots.one.page = FALSE)


                              all   No   Yes
EnglishDiff                0.0152   NA    NA
EnglishSpeak               0.0093   NA    NA
Income_median              0.0091   NA    NA
Gender                     0.0026   NA    NA
Spend Time Together        0.0007   NA    NA
Community Trust            0.0006   NA    NA
See Friends                0.0000   NA    NA
Community Shares Values   -0.0005   NA    NA
Successful Family         -0.0006   NA    NA
Expression                -0.0007   NA    NA
Get Along                 -0.0011   NA    NA
Loyalty                   -0.0013   NA    NA
Close Friends             -0.0014   NA    NA
Similar Values            -0.0019   NA    NA
Close Family              -0.0026   NA    NA
Family Pride              -0.0027   NA    NA
Feel Close                -0.0032   NA    NA
See Family                -0.0032   NA    NA
Helpful Family            -0.0032   NA    NA
Religious Attendance      -0.0033   NA    NA
Togetherness              -0.0038   NA    NA
Religion                  -0.0058   NA    NA
Family Respect            -0.0060   NA    NA
Employment                -0.0066   NA    NA
Helpful Friends           -0.0073   NA    NA
Helpful Community         -0.0078   NA    NA
# Raw variable-importance matrix of the training-set forest.
rfobj[["importance"]]
                                  all No Yes
Ethnicity               -1.119613e-02 NA  NA
Age                     -9.199712e-03 NA  NA
Gender                   2.628499e-03 NA  NA
Religion                -5.806950e-03 NA  NA
Employment              -6.595408e-03 NA  NA
Income_median            9.104856e-03 NA  NA
EnglishSpeak             9.258181e-03 NA  NA
EnglishDiff              1.515708e-02 NA  NA
See Family              -3.212321e-03 NA  NA
Close Family            -2.571226e-03 NA  NA
Helpful Family          -3.243786e-03 NA  NA
See Friends             -2.444395e-05 NA  NA
Close Friends           -1.429018e-03 NA  NA
Helpful Friends         -7.326022e-03 NA  NA
Family Respect          -5.986364e-03 NA  NA
Similar Values          -1.929449e-03 NA  NA
Successful Family       -6.151098e-04 NA  NA
Trust                   -7.796050e-03 NA  NA
Loyalty                 -1.286987e-03 NA  NA
Family Pride            -2.674305e-03 NA  NA
Expression              -6.696919e-04 NA  NA
Spend Time Together      7.309024e-04 NA  NA
Feel Close              -3.212321e-03 NA  NA
Togetherness            -3.817017e-03 NA  NA
Religious Attendance    -3.272389e-03 NA  NA
Religious Importance    -1.390461e-02 NA  NA
Close-knit Community    -8.577679e-03 NA  NA
Helpful Community       -7.760562e-03 NA  NA
Community Shares Values -5.490300e-04 NA  NA
Get Along               -1.107606e-03 NA  NA
Community Trust          6.214982e-04 NA  NA
# Horizontal bar chart of the overall ("all" column) variable importance for
# the training-set forest, with predictors ordered by importance.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

importance_plot <- ggplot(
  var_importance_df,
  aes(x = reorder(variable, importance), y = importance)
) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()

plot(importance_plot)

Test Set Performance

# Score the held-out rows with the forest grown on `train`, then report the
# imbalanced-classification metrics for those test-set predictions.
# The original passed `rfobj` to get.imbalanced.performance(), which merely
# repeated the training (OOB) metrics and left `test_rf` unused. The numbers
# printed below in this rendered report predate the fix and must be
# regenerated.
test_rf <- predict(rfobj, newdata = test)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
788.0000000 735.0000000   1.0721088   0.4826001   0.6095238   0.5913706 
       prec         npv    misclass       brier  brier.norm         auc 
  0.5818182   0.6188579   0.3998687   0.2368794   0.9475177   0.6356193 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
  0.5953488   0.6000382   0.4826001   0.6073044   0.5978637   0.6002084 
      gmean 
  0.6003786 

Health Insurance

# Tabulate the levels of `Health Insurance` with the helper ps(), which is
# defined outside this section (presumably count and percentage per level,
# judging by the tibble printed below -- TODO confirm). Note that the
# negative level is coded "0" in this column, not "No".
ps(`Health Insurance`)
# A tibble: 3 × 3
  `Health Insurance`     n    pct
  <fct>              <int>  <dbl>
1 0                    381 14.6  
2 Yes                 2207 84.6  
3 <NA>                  21  0.805

Random Forest (randomForestSRC)

# install.packages("randomForestSRC")

# Modelling frame for the "Health Insurance" outcome: outcome plus the same
# predictor set as before, complete cases only, with shortened column names.
rfdata <- qol |>
  select(
    `Health Insurance`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median, `English Speaking`,
    `English Difficulties`, `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Forest on the full modelling frame; importance is requested and G-mean is
# the reported performance measure.
imb <- imbalanced(
  `Health Insurance` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb)
                         Sample size: 2189
           Frequency of class labels: 292, 1897
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 258.4477
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1383
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 6.4966
                   (OOB) Brier score: 0.1037283
        (OOB) Normalized Brier score: 0.41491322
                           (OOB) AUC: 0.7475827
                        (OOB) PR-AUC: 0.33023442
                        (OOB) G-mean: 0.66755909
   (OOB) Requested performance error: 0.33244091

Confusion matrix:

          predicted
  observed   0  Yes class.error
       0   224   68      0.2329
       Yes 795 1102      0.4191

      (OOB) Misclassification rate: 0.3942439
# Full set of imbalanced-classification metrics for the forest in `imb`.
imb |> get.imbalanced.performance()
  n.majority   n.minority       iratio    threshold         sens         spec 
1897.0000000  292.0000000    6.4965753    0.1333942    0.7671233    0.5809172 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2198234    0.9418803    0.3942439    0.1037283    0.4149132    0.7475827 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.3417239    0.4631881    0.1333942    0.3302344    0.5046415    0.5653736 
       gmean 
   0.6675591 
# Horizontal bar chart of the overall ("all" column) variable importance from
# `imb`, with predictors ordered by their importance value.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

importance_plot <- ggplot(
  var_importance_df,
  aes(x = reorder(variable, importance), y = importance)
) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()

plot(importance_plot)

Training/Test set Variable Importance

Training Importance

# Hold-out split stratified by outcome: within each class every row is
# independently labelled 0 (train, probability 0.7) or 1 (test, probability
# 0.3), so the class sizes in train/test are approximately, not exactly, 70/30.
#
# The negative level of `Health Insurance` is coded "0" (see the frequency
# table and confusion matrix above), not "No". Filtering on "No" produced an
# empty `neg`, so the forest was trained on a single class ("empty classes
# found" warning, NaN metrics). The output printed below in this rendered
# report predates the fix and must be regenerated.
pos <- rfdata |> filter(`Health Insurance` == "Yes")
neg <- rfdata |> filter(`Health Insurance` == "0")

# Fail loudly instead of silently fitting a one-class forest.
stopifnot(nrow(pos) > 0, nrow(neg) > 0)

set.seed(222)
ind_pos <- sample(c(0, 1), nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(c(0, 1), nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 0, ], neg[ind_neg == 0, ])
test <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])

# Fit the forest on the training rows only.
rfobj <- imbalanced(
  `Health Insurance` ~ .,
  data = train,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
Warning in rfsrc(formula = `Health Insurance` ~ ., data = structure(list(: empty classes found when implementing classification
# Print the summary of the training-set forest held in `rfobj`.
rfobj |> print()
                         Sample size: 1332
           Frequency of class labels: NA, 1332
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 1
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 842
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: Inf
                   (OOB) Brier score: 0
        (OOB) Normalized Brier score: 0
                           (OOB) AUC: NaN
                        (OOB) PR-AUC: NA
                        (OOB) G-mean: NaN
   (OOB) Requested performance error: 1

Confusion matrix:

          predicted
  observed    0 Yes class.error
       0      0   0         NaN
       Yes 1332   0           1

      (OOB) Misclassification rate: 1
# Print the summary of the training-set forest held in `rfobj`.
rfobj |> print()
                         Sample size: 1332
           Frequency of class labels: NA, 1332
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 1
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 842
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: Inf
                   (OOB) Brier score: 0
        (OOB) Normalized Brier score: 0
                           (OOB) AUC: NaN
                        (OOB) PR-AUC: NA
                        (OOB) G-mean: NaN
   (OOB) Requested performance error: 1

Confusion matrix:

          predicted
  observed    0 Yes class.error
       0      0   0         NaN
       Yes 1332   0           1

      (OOB) Misclassification rate: 1
# Default diagnostic plots for the training-set forest, one plot per page.
rfobj |> plot(plots.one.page = FALSE)


                          all    0   Yes
Community Trust             0   NA    NA
Get Along                   0   NA    NA
Community Shares Values     0   NA    NA
Helpful Community           0   NA    NA
Close-knit Community        0   NA    NA
Religious Importance        0   NA    NA
Religious Attendance        0   NA    NA
Togetherness                0   NA    NA
Feel Close                  0   NA    NA
Spend Time Together         0   NA    NA
Expression                  0   NA    NA
Family Pride                0   NA    NA
Loyalty                     0   NA    NA
Trust                       0   NA    NA
Successful Family           0   NA    NA
Similar Values              0   NA    NA
Family Respect              0   NA    NA
Helpful Friends             0   NA    NA
Close Friends               0   NA    NA
See Friends                 0   NA    NA
Helpful Family              0   NA    NA
Close Family                0   NA    NA
See Family                  0   NA    NA
EnglishDiff                 0   NA    NA
EnglishSpeak                0   NA    NA
Income_median               0   NA    NA
# Raw variable-importance matrix of the training-set forest.
rfobj[["importance"]]
                        all  0 Yes
Ethnicity                 0 NA  NA
Age                       0 NA  NA
Gender                    0 NA  NA
Religion                  0 NA  NA
Employment                0 NA  NA
Income_median             0 NA  NA
EnglishSpeak              0 NA  NA
EnglishDiff               0 NA  NA
See Family                0 NA  NA
Close Family              0 NA  NA
Helpful Family            0 NA  NA
See Friends               0 NA  NA
Close Friends             0 NA  NA
Helpful Friends           0 NA  NA
Family Respect            0 NA  NA
Similar Values            0 NA  NA
Successful Family         0 NA  NA
Trust                     0 NA  NA
Loyalty                   0 NA  NA
Family Pride              0 NA  NA
Expression                0 NA  NA
Spend Time Together       0 NA  NA
Feel Close                0 NA  NA
Togetherness              0 NA  NA
Religious Attendance      0 NA  NA
Religious Importance      0 NA  NA
Close-knit Community      0 NA  NA
Helpful Community         0 NA  NA
Community Shares Values   0 NA  NA
Get Along                 0 NA  NA
Community Trust           0 NA  NA
# Horizontal bar chart of the overall ("all" column) variable importance for
# the training-set forest, with predictors ordered by importance.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

importance_plot <- ggplot(
  var_importance_df,
  aes(x = reorder(variable, importance), y = importance)
) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()

plot(importance_plot)

Test Set Performance

# Score the held-out rows with the forest grown on `train`, then report the
# imbalanced-classification metrics for those test-set predictions.
# The original passed `rfobj` to get.imbalanced.performance(), which merely
# repeated the training (OOB) metrics and left `test_rf` unused. The numbers
# printed below in this rendered report predate the fix and must be
# regenerated.
test_rf <- predict(rfobj, newdata = test)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
       1332           0         Inf           0         NaN           0 
       prec         npv    misclass       brier  brier.norm         auc 
          0         NaN           1           0           0         NaN 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
        NaN         NaN          NA          NA         NaN         NaN 
      gmean 
        NaN 

Dental Insurance

# Tabulate the levels of `Dental Insurance` with the helper ps(), which is
# defined outside this section (presumably count and percentage per level,
# judging by the tibble printed below -- TODO confirm). Note that the
# negative level is coded "0" in this column, not "No".
ps(`Dental Insurance`)
# A tibble: 3 × 3
  `Dental Insurance`     n   pct
  <fct>              <int> <dbl>
1 0                   1050 40.2 
2 Yes                 1529 58.6 
3 <NA>                  30  1.15

Random Forest (randomForestSRC)

# install.packages("randomForestSRC")

# Modelling frame: the outcome plus the selected predictor columns, complete
# cases only, with the back-ticked names shortened for the formula interface.
rfdata <- qol |>
  select(
    `Dental Insurance`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median,
    `English Speaking`, `English Difficulties`,
    `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Imbalanced-class random forest fitted on the full modelling frame.
imb <- imbalanced(
  `Dental Insurance` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb)
                         Sample size: 2184
           Frequency of class labels: 849, 1335
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 397.2453
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1380
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.5724
                   (OOB) Brier score: 0.17717336
        (OOB) Normalized Brier score: 0.70869343
                           (OOB) AUC: 0.79752209
                        (OOB) PR-AUC: 0.70202991
                        (OOB) G-mean: 0.72735508
   (OOB) Requested performance error: 0.27264492

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   642 207      0.2438
       Yes 401 934      0.3004

      (OOB) Misclassification rate: 0.2783883
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1335.0000000  849.0000000    1.5724382    0.3887363    0.7561837    0.6996255 
        prec          npv     misclass        brier   brier.norm          auc 
   0.6155321    0.8185802    0.2783883    0.1771734    0.7086934    0.7975221 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.6786469    0.7145404    0.3887363    0.7020299    0.7030010    0.7209477 
       gmean 
   0.7273551 
# Pull the overall ("all") importance column out of `imb` and reshape it into
# a two-column data frame for plotting.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with variables ordered by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_minimal()

plot(importance_plot)

Training/Test set Variable Importance

Training Importance

# Stratified ~70/30 train/test split: rows are assigned within each outcome
# class so that both classes reach the training and the test set.
# The response is coded "0"/"Yes" (there is no "No" level), so the negative
# class is taken as every row that is not "Yes"; `rfdata` has already had its
# NA rows dropped. Filtering on "No" matched nothing and left the training
# forest with a single class.
pos <- rfdata |> filter(`Dental Insurance` == "Yes")
neg <- rfdata |> filter(`Dental Insurance` != "Yes")

# Fail loudly if either class is missing instead of fitting a degenerate forest.
stopifnot(nrow(pos) > 0, nrow(neg) > 0)

set.seed(222)
# 0 = training row (probability 0.7), 1 = test row (probability 0.3)
ind_pos <- sample(c(0, 1), nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(c(0, 1), nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 0, ], neg[ind_neg == 0, ])
test <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
# Imbalanced-class random forest fitted on the training rows only.
rfobj <- imbalanced(`Dental Insurance` ~ ., data = train,
                    importance = TRUE, perf.type = "gmean",
                    splitrule = "gini")
Warning in rfsrc(formula = `Dental Insurance` ~ ., data = structure(list(: empty classes found when implementing classification
print(rfobj)
                         Sample size: 932
           Frequency of class labels: NA, 932
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 1
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 589
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: Inf
                   (OOB) Brier score: 0
        (OOB) Normalized Brier score: 0
                           (OOB) AUC: NaN
                        (OOB) PR-AUC: NA
                        (OOB) G-mean: NaN
   (OOB) Requested performance error: 1

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0     0   0         NaN
       Yes 932   0           1

      (OOB) Misclassification rate: 1
print(rfobj)
                         Sample size: 932
           Frequency of class labels: NA, 932
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 1
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 589
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: Inf
                   (OOB) Brier score: 0
        (OOB) Normalized Brier score: 0
                           (OOB) AUC: NaN
                        (OOB) PR-AUC: NA
                        (OOB) G-mean: NaN
   (OOB) Requested performance error: 1

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0     0   0         NaN
       Yes 932   0           1

      (OOB) Misclassification rate: 1
plot(rfobj,plots.one.page = FALSE)


                          all    0   Yes
Community Trust             0   NA    NA
Get Along                   0   NA    NA
Community Shares Values     0   NA    NA
Helpful Community           0   NA    NA
Close-knit Community        0   NA    NA
Religious Importance        0   NA    NA
Religious Attendance        0   NA    NA
Togetherness                0   NA    NA
Feel Close                  0   NA    NA
Spend Time Together         0   NA    NA
Expression                  0   NA    NA
Family Pride                0   NA    NA
Loyalty                     0   NA    NA
Trust                       0   NA    NA
Successful Family           0   NA    NA
Similar Values              0   NA    NA
Family Respect              0   NA    NA
Helpful Friends             0   NA    NA
Close Friends               0   NA    NA
See Friends                 0   NA    NA
Helpful Family              0   NA    NA
Close Family                0   NA    NA
See Family                  0   NA    NA
EnglishDiff                 0   NA    NA
EnglishSpeak                0   NA    NA
Income_median               0   NA    NA
rfobj$importance
                        all  0 Yes
Ethnicity                 0 NA  NA
Age                       0 NA  NA
Gender                    0 NA  NA
Religion                  0 NA  NA
Employment                0 NA  NA
Income_median             0 NA  NA
EnglishSpeak              0 NA  NA
EnglishDiff               0 NA  NA
See Family                0 NA  NA
Close Family              0 NA  NA
Helpful Family            0 NA  NA
See Friends               0 NA  NA
Close Friends             0 NA  NA
Helpful Friends           0 NA  NA
Family Respect            0 NA  NA
Similar Values            0 NA  NA
Successful Family         0 NA  NA
Trust                     0 NA  NA
Loyalty                   0 NA  NA
Family Pride              0 NA  NA
Expression                0 NA  NA
Spend Time Together       0 NA  NA
Feel Close                0 NA  NA
Togetherness              0 NA  NA
Religious Attendance      0 NA  NA
Religious Importance      0 NA  NA
Close-knit Community      0 NA  NA
Helpful Community         0 NA  NA
Community Shares Values   0 NA  NA
Get Along                 0 NA  NA
Community Trust           0 NA  NA
# Pull the overall ("all") importance column out of the training-set forest
# and reshape it into a two-column data frame for plotting.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with variables ordered by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_bw()

plot(importance_plot)

Test Set Importance

# Score the held-out rows with the training-set forest, then report the
# imbalanced-classification metrics for those predictions.
# The metrics must be computed from `test_rf`: passing `rfobj` only repeats the
# training (OOB) summary and leaves the test predictions unused.
test_rf <- predict(rfobj, newdata = test)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
        932           0         Inf           0         NaN           0 
       prec         npv    misclass       brier  brier.norm         auc 
          0         NaN           1           0           0         NaN 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
        NaN         NaN          NA          NA         NaN         NaN 
      gmean 
        NaN 

Physical Checkup

ps(`Physical Check-up`)
# A tibble: 3 × 3
  `Physical Check-up`     n   pct
  <fct>               <int> <dbl>
1 0                     833 31.9 
2 Yes                  1740 66.7 
3 <NA>                   36  1.38

Random Forest (randomForestSRC)

# install.packages("randomForestSRC")

# Modelling frame: the outcome plus the selected predictor columns, complete
# cases only, with the back-ticked names shortened for the formula interface.
rfdata <- qol |>
  select(
    `Physical Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median,
    `English Speaking`, `English Difficulties`,
    `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Imbalanced-class random forest fitted on the full modelling frame.
imb <- imbalanced(
  `Physical Check-up` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb)
                         Sample size: 2178
           Frequency of class labels: 704, 1474
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 425.347
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1376
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 2.0938
                   (OOB) Brier score: 0.19839173
        (OOB) Normalized Brier score: 0.79356692
                           (OOB) AUC: 0.68853017
                        (OOB) PR-AUC: 0.49762209
                        (OOB) G-mean: 0.63818128
   (OOB) Requested performance error: 0.36181872

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   492 212      0.3011
       Yes 615 859      0.4172

      (OOB) Misclassification rate: 0.3797062
plot(imb,plots.one.page = F)


                           all    0   Yes
Age                     0.0537   NA    NA
Income_median           0.0194   NA    NA
Ethnicity               0.0109   NA    NA
Expression              0.0071   NA    NA
Employment              0.0048   NA    NA
Religious Importance    0.0040   NA    NA
Loyalty                 0.0036   NA    NA
Get Along               0.0026   NA    NA
Gender                  0.0022   NA    NA
EnglishDiff             0.0012   NA    NA
EnglishSpeak            0.0010   NA    NA
Helpful Friends         0.0002   NA    NA
Religious Attendance   -0.0002   NA    NA
Family Pride           -0.0003   NA    NA
Religion               -0.0006   NA    NA
Successful Family      -0.0012   NA    NA
Togetherness           -0.0013   NA    NA
Close-knit Community   -0.0017   NA    NA
Spend Time Together    -0.0019   NA    NA
Helpful Family         -0.0019   NA    NA
Close Family           -0.0020   NA    NA
See Family             -0.0026   NA    NA
Feel Close             -0.0028   NA    NA
Helpful Community      -0.0034   NA    NA
Similar Values         -0.0036   NA    NA
Community Trust        -0.0046   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1474.0000000  704.0000000    2.0937500    0.3232323    0.6988636    0.5827680 
        prec          npv     misclass        brier   brier.norm          auc 
   0.4444444    0.8020542    0.3797062    0.1983917    0.7935669    0.6885302 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.5433462    0.6020794    0.3232323    0.4976221    0.5907637    0.6201304 
       gmean 
   0.6381813 
# Pull the overall ("all") importance column out of `imb` and reshape it into
# a two-column data frame for plotting.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with variables ordered by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_minimal()

plot(importance_plot)

Training/Test set Variable Importance

Training Importance

# Stratified ~70/30 train/test split: rows are assigned within each outcome
# class so that both classes reach the training and the test set.
# The response is coded "0"/"Yes" (there is no "No" level), so the negative
# class is taken as every row that is not "Yes"; `rfdata` has already had its
# NA rows dropped. Filtering on "No" matched nothing and left the training
# forest with a single class.
pos <- rfdata |> filter(`Physical Check-up` == "Yes")
neg <- rfdata |> filter(`Physical Check-up` != "Yes")

# Fail loudly if either class is missing instead of fitting a degenerate forest.
stopifnot(nrow(pos) > 0, nrow(neg) > 0)

set.seed(222)
# 0 = training row (probability 0.7), 1 = test row (probability 0.3)
ind_pos <- sample(c(0, 1), nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(c(0, 1), nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 0, ], neg[ind_neg == 0, ])
test <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
# Imbalanced-class random forest fitted on the training rows only.
rfobj <- imbalanced(`Physical Check-up` ~ ., data = train,
                    importance = TRUE, perf.type = "gmean",
                    splitrule = "gini")
Warning in rfsrc(formula = `Physical Check-up` ~ ., data = structure(list(: empty classes found when implementing classification
print(rfobj)
                         Sample size: 1028
           Frequency of class labels: NA, 1028
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 1
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 650
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: Inf
                   (OOB) Brier score: 0
        (OOB) Normalized Brier score: 0
                           (OOB) AUC: NaN
                        (OOB) PR-AUC: NA
                        (OOB) G-mean: NaN
   (OOB) Requested performance error: 1

Confusion matrix:

          predicted
  observed    0 Yes class.error
       0      0   0         NaN
       Yes 1028   0           1

      (OOB) Misclassification rate: 1
print(rfobj)
                         Sample size: 1028
           Frequency of class labels: NA, 1028
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 1
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 650
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: Inf
                   (OOB) Brier score: 0
        (OOB) Normalized Brier score: 0
                           (OOB) AUC: NaN
                        (OOB) PR-AUC: NA
                        (OOB) G-mean: NaN
   (OOB) Requested performance error: 1

Confusion matrix:

          predicted
  observed    0 Yes class.error
       0      0   0         NaN
       Yes 1028   0           1

      (OOB) Misclassification rate: 1
plot(rfobj,plots.one.page = FALSE)


                          all    0   Yes
Community Trust             0   NA    NA
Get Along                   0   NA    NA
Community Shares Values     0   NA    NA
Helpful Community           0   NA    NA
Close-knit Community        0   NA    NA
Religious Importance        0   NA    NA
Religious Attendance        0   NA    NA
Togetherness                0   NA    NA
Feel Close                  0   NA    NA
Spend Time Together         0   NA    NA
Expression                  0   NA    NA
Family Pride                0   NA    NA
Loyalty                     0   NA    NA
Trust                       0   NA    NA
Successful Family           0   NA    NA
Similar Values              0   NA    NA
Family Respect              0   NA    NA
Helpful Friends             0   NA    NA
Close Friends               0   NA    NA
See Friends                 0   NA    NA
Helpful Family              0   NA    NA
Close Family                0   NA    NA
See Family                  0   NA    NA
EnglishDiff                 0   NA    NA
EnglishSpeak                0   NA    NA
Income_median               0   NA    NA
rfobj$importance
                        all  0 Yes
Ethnicity                 0 NA  NA
Age                       0 NA  NA
Gender                    0 NA  NA
Religion                  0 NA  NA
Employment                0 NA  NA
Income_median             0 NA  NA
EnglishSpeak              0 NA  NA
EnglishDiff               0 NA  NA
See Family                0 NA  NA
Close Family              0 NA  NA
Helpful Family            0 NA  NA
See Friends               0 NA  NA
Close Friends             0 NA  NA
Helpful Friends           0 NA  NA
Family Respect            0 NA  NA
Similar Values            0 NA  NA
Successful Family         0 NA  NA
Trust                     0 NA  NA
Loyalty                   0 NA  NA
Family Pride              0 NA  NA
Expression                0 NA  NA
Spend Time Together       0 NA  NA
Feel Close                0 NA  NA
Togetherness              0 NA  NA
Religious Attendance      0 NA  NA
Religious Importance      0 NA  NA
Close-knit Community      0 NA  NA
Helpful Community         0 NA  NA
Community Shares Values   0 NA  NA
Get Along                 0 NA  NA
Community Trust           0 NA  NA
# Pull the overall ("all") importance column out of the training-set forest
# and reshape it into a two-column data frame for plotting.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with variables ordered by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_bw()

plot(importance_plot)

Test Set Importance

# Score the held-out rows with the training-set forest, then report the
# imbalanced-classification metrics for those predictions.
# The metrics must be computed from `test_rf`: passing `rfobj` only repeats the
# training (OOB) summary and leaves the test predictions unused.
test_rf <- predict(rfobj, newdata = test)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
       1028           0         Inf           0         NaN           0 
       prec         npv    misclass       brier  brier.norm         auc 
          0         NaN           1           0           0         NaN 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
        NaN         NaN          NA          NA         NaN         NaN 
      gmean 
        NaN 

Dental Checkup

ps(`Dentist Check-up`)
# A tibble: 3 × 3
  `Dentist Check-up`     n   pct
  <fct>              <int> <dbl>
1 0                   1100 42.2 
2 Yes                 1462 56.0 
3 <NA>                  47  1.80

Random Forest (randomForestSRC)

# install.packages("randomForestSRC")

# Modelling frame: the outcome plus the selected predictor columns, complete
# cases only, with the back-ticked names shortened for the formula interface.
rfdata <- qol |>
  select(
    `Dentist Check-up`, Ethnicity, Age, Gender, Religion,
    `Full Time Employment`, Income_median,
    `English Speaking`, `English Difficulties`,
    `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Imbalanced-class random forest fitted on the full modelling frame.
imb <- imbalanced(
  `Dentist Check-up` ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb)
                         Sample size: 2175
           Frequency of class labels: 896, 1279
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 450.2923
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1375
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 1.4275
                   (OOB) Brier score: 0.21233082
        (OOB) Normalized Brier score: 0.84932329
                           (OOB) AUC: 0.70795578
                        (OOB) PR-AUC: 0.60124704
                        (OOB) G-mean: 0.65641478
   (OOB) Requested performance error: 0.34358522

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   618 278      0.3103
       Yes 480 799      0.3753

      (OOB) Misclassification rate: 0.3485057
plot(imb,plots.one.page = F)


                           all    0   Yes
EnglishSpeak            0.0169   NA    NA
Age                     0.0144   NA    NA
EnglishDiff             0.0103   NA    NA
Employment              0.0072   NA    NA
Gender                  0.0071   NA    NA
Similar Values          0.0070   NA    NA
Income_median           0.0051   NA    NA
Helpful Family          0.0049   NA    NA
Religious Importance    0.0047   NA    NA
Expression              0.0038   NA    NA
Togetherness            0.0035   NA    NA
Helpful Friends         0.0033   NA    NA
See Friends             0.0032   NA    NA
Successful Family       0.0031   NA    NA
Family Pride            0.0029   NA    NA
Loyalty                 0.0017   NA    NA
See Family              0.0016   NA    NA
Feel Close              0.0005   NA    NA
Close Family            0.0004   NA    NA
Get Along               0.0003   NA    NA
Ethnicity               0.0003   NA    NA
Community Trust         0.0001   NA    NA
Religious Attendance    0.0000   NA    NA
Spend Time Together    -0.0001   NA    NA
Close Friends          -0.0010   NA    NA
Religion               -0.0028   NA    NA
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1279.0000000  896.0000000    1.4274554    0.4119540    0.6897321    0.6247068 
        prec          npv     misclass        brier   brier.norm          auc 
   0.5628415    0.7418756    0.3485057    0.2123308    0.8493233    0.7079558 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.6198596    0.6477499    0.4119540    0.6012470    0.6381372    0.6520823 
       gmean 
   0.6564148 
# Pull the overall ("all") importance column out of `imb` and reshape it into
# a two-column data frame for plotting.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with variables ordered by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_minimal()

plot(importance_plot)

Training/Test set Variable Importance

Training Importance

# Stratified ~70/30 train/test split: rows are assigned within each outcome
# class so that both classes reach the training and the test set.
# The response is coded "0"/"Yes" (there is no "No" level), so the negative
# class is taken as every row that is not "Yes"; `rfdata` has already had its
# NA rows dropped. Filtering on "No" matched nothing and left the training
# forest with a single class.
pos <- rfdata |> filter(`Dentist Check-up` == "Yes")
neg <- rfdata |> filter(`Dentist Check-up` != "Yes")

# Fail loudly if either class is missing instead of fitting a degenerate forest.
stopifnot(nrow(pos) > 0, nrow(neg) > 0)

set.seed(222)
# 0 = training row (probability 0.7), 1 = test row (probability 0.3)
ind_pos <- sample(c(0, 1), nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(c(0, 1), nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 0, ], neg[ind_neg == 0, ])
test <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
# Imbalanced-class random forest fitted on the training rows only.
rfobj <- imbalanced(`Dentist Check-up` ~ ., data = train,
                    importance = TRUE, perf.type = "gmean",
                    splitrule = "gini")
Warning in rfsrc(formula = `Dentist Check-up` ~ ., data = structure(list(: empty classes found when implementing classification
print(rfobj)
                         Sample size: 896
           Frequency of class labels: NA, 896
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 1
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 566
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: Inf
                   (OOB) Brier score: 0
        (OOB) Normalized Brier score: 0
                           (OOB) AUC: NaN
                        (OOB) PR-AUC: NA
                        (OOB) G-mean: NaN
   (OOB) Requested performance error: 1

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0     0   0         NaN
       Yes 896   0           1

      (OOB) Misclassification rate: 1
print(rfobj)
                         Sample size: 896
           Frequency of class labels: NA, 896
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 1
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 566
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: Inf
                   (OOB) Brier score: 0
        (OOB) Normalized Brier score: 0
                           (OOB) AUC: NaN
                        (OOB) PR-AUC: NA
                        (OOB) G-mean: NaN
   (OOB) Requested performance error: 1

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0     0   0         NaN
       Yes 896   0           1

      (OOB) Misclassification rate: 1
plot(rfobj,plots.one.page = FALSE)


                          all    0   Yes
Community Trust             0   NA    NA
Get Along                   0   NA    NA
Community Shares Values     0   NA    NA
Helpful Community           0   NA    NA
Close-knit Community        0   NA    NA
Religious Importance        0   NA    NA
Religious Attendance        0   NA    NA
Togetherness                0   NA    NA
Feel Close                  0   NA    NA
Spend Time Together         0   NA    NA
Expression                  0   NA    NA
Family Pride                0   NA    NA
Loyalty                     0   NA    NA
Trust                       0   NA    NA
Successful Family           0   NA    NA
Similar Values              0   NA    NA
Family Respect              0   NA    NA
Helpful Friends             0   NA    NA
Close Friends               0   NA    NA
See Friends                 0   NA    NA
Helpful Family              0   NA    NA
Close Family                0   NA    NA
See Family                  0   NA    NA
EnglishDiff                 0   NA    NA
EnglishSpeak                0   NA    NA
Income_median               0   NA    NA
rfobj$importance
                        all  0 Yes
Ethnicity                 0 NA  NA
Age                       0 NA  NA
Gender                    0 NA  NA
Religion                  0 NA  NA
Employment                0 NA  NA
Income_median             0 NA  NA
EnglishSpeak              0 NA  NA
EnglishDiff               0 NA  NA
See Family                0 NA  NA
Close Family              0 NA  NA
Helpful Family            0 NA  NA
See Friends               0 NA  NA
Close Friends             0 NA  NA
Helpful Friends           0 NA  NA
Family Respect            0 NA  NA
Similar Values            0 NA  NA
Successful Family         0 NA  NA
Trust                     0 NA  NA
Loyalty                   0 NA  NA
Family Pride              0 NA  NA
Expression                0 NA  NA
Spend Time Together       0 NA  NA
Feel Close                0 NA  NA
Togetherness              0 NA  NA
Religious Attendance      0 NA  NA
Religious Importance      0 NA  NA
Close-knit Community      0 NA  NA
Helpful Community         0 NA  NA
Community Shares Values   0 NA  NA
Get Along                 0 NA  NA
Community Trust           0 NA  NA
# Pull the overall ("all") importance column out of the training-set forest
# and reshape it into a two-column data frame for plotting.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart with variables ordered by importance.
importance_plot <- var_importance_df |>
  ggplot(aes(x = reorder(variable, importance), y = importance)) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(
    title = "Variable Importance",
    x = "Variable",
    y = "Importance"
  ) +
  theme_bw()

plot(importance_plot)

Test Set Importance

# Score the held-out rows with the training-set forest, then report the
# imbalanced-classification metrics for those predictions.
# The metrics must be computed from `test_rf`: passing `rfobj` only repeats the
# training (OOB) summary and leaves the test predictions unused.
test_rf <- predict(rfobj, newdata = test)
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
        896           0         Inf           0         NaN           0 
       prec         npv    misclass       brier  brier.norm         auc 
          0         NaN           1           0           0         NaN 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
        NaN         NaN          NA          NA         NaN         NaN 
      gmean 
        NaN 

Urgent Care

ps(`Urgentcare`)
# A tibble: 3 × 3
  Urgentcare     n   pct
  <fct>      <int> <dbl>
1 0           2112 81.0 
2 Yes          440 16.9 
3 <NA>          57  2.18

Random Forest (randomForestSRC)

# install.packages("randomForestSRC")

# Modelling frame for the urgent-care outcome: keep the response and the
# predictors, drop rows with any missing value, and shorten the column
# names that contain spaces.
rfdata <- qol |>
  select(
    Urgentcare, Ethnicity, Age, Gender, Religion, `Full Time Employment`,
    Income_median, `English Speaking`, `English Difficulties`,
    `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Fit the imbalanced-data forest on all complete cases, requesting variable
# importance and the G-mean as the performance measure.
imb <- imbalanced(
  Urgentcare ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb)
                         Sample size: 2167
           Frequency of class labels: 1808, 359
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 326.3993
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1370
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 5.0362
                   (OOB) Brier score: 0.13731167
        (OOB) Normalized Brier score: 0.54924669
                           (OOB) AUC: 0.59106078
                        (OOB) PR-AUC: 0.23239721
                        (OOB) G-mean: 0.5633998
   (OOB) Requested performance error: 0.4366002

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0   873 935      0.5171
       Yes 123 236      0.3426

      (OOB) Misclassification rate: 0.4882326
# Diagnostic plots for the full-data forest; the call also prints the
# importance table shown below.
plot(imb, plots.one.page = FALSE)


                          all    0   Yes
Ethnicity              0.0307   NA    NA
Close Family           0.0301   NA    NA
Age                    0.0256   NA    NA
Trust                  0.0212   NA    NA
EnglishSpeak           0.0189   NA    NA
Get Along              0.0179   NA    NA
Religion               0.0179   NA    NA
Close Friends          0.0173   NA    NA
Loyalty                0.0170   NA    NA
Togetherness           0.0164   NA    NA
Feel Close             0.0143   NA    NA
Family Respect         0.0138   NA    NA
Family Pride           0.0135   NA    NA
Helpful Family         0.0135   NA    NA
Helpful Friends        0.0130   NA    NA
See Friends            0.0120   NA    NA
Helpful Community      0.0116   NA    NA
Employment             0.0112   NA    NA
Spend Time Together    0.0111   NA    NA
Religious Importance   0.0109   NA    NA
Income_median          0.0109   NA    NA
Similar Values         0.0105   NA    NA
Successful Family      0.0096   NA    NA
Religious Attendance   0.0096   NA    NA
Community Trust        0.0086   NA    NA
See Family             0.0083   NA    NA
# Performance summary of the full-data forest (sensitivity, specificity,
# AUC, G-mean, ... as listed in the output below).
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1808.0000000  359.0000000    5.0362117    0.1656668    0.6573816    0.4828540 
        prec          npv     misclass        brier   brier.norm          auc 
   0.2015371    0.8765060    0.4882326    0.1373117    0.5492467    0.5910608 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.3084967    0.4125853    0.1656668    0.2323972    0.4359483    0.4879926 
       gmean 
   0.5633998 
# Overall ("all" column) variable importance of the full-data forest.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart; reorder() sorts the bars by importance.
importance_plot <- ggplot(
  var_importance_df,
  aes(x = reorder(variable, importance), y = importance)
) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()

plot(importance_plot)

Training/Test set Variable Importance

Training Importance

# Stratified (roughly 70/30) train/test split of the urgent-care data.
# The outcome's levels are "0" and "Yes" (see the ps() table and the
# confusion matrix of the full-data fit), so the negative class is selected
# as "not Yes". Filtering on "No" matches no rows, which leaves `train`
# with a single class and yields a degenerate forest.
pos <- rfdata |> filter(Urgentcare == "Yes")
neg <- rfdata |> filter(Urgentcare != "Yes")

# Fail fast instead of silently fitting a one-class forest.
stopifnot(nrow(pos) > 0, nrow(neg) > 0)

set.seed(222)
# Per-row indicator: 0 = training (probability 0.7), 1 = test (0.3).
ind_pos <- sample(c(0, 1), nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(c(0, 1), nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 0, ], neg[ind_neg == 0, ])
test <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
# Fit the imbalanced-data forest on the training rows only.
rfobj <- imbalanced(
  Urgentcare ~ .,
  data = train,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
Warning in rfsrc(formula = Urgentcare ~ ., data = structure(list(Urgentcare = structure(c(2L, : empty classes found when implementing classification
# Summary of the training-set forest (sample size, class frequencies,
# OOB metrics and confusion matrix).
print(rfobj)
                         Sample size: 251
           Frequency of class labels: NA, 251
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 1
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 159
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: Inf
                   (OOB) Brier score: 0
        (OOB) Normalized Brier score: 0
                           (OOB) AUC: NaN
                        (OOB) PR-AUC: NA
                        (OOB) G-mean: NaN
   (OOB) Requested performance error: 1

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0     0   0         NaN
       Yes 251   0           1

      (OOB) Misclassification rate: 1
# NOTE(review): this prints the same object a second time and repeats the
# summary above -- confirm whether the duplicate is intentional.
print(rfobj)
                         Sample size: 251
           Frequency of class labels: NA, 251
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 1
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 159
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: Inf
                   (OOB) Brier score: 0
        (OOB) Normalized Brier score: 0
                           (OOB) AUC: NaN
                        (OOB) PR-AUC: NA
                        (OOB) G-mean: NaN
   (OOB) Requested performance error: 1

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0     0   0         NaN
       Yes 251   0           1

      (OOB) Misclassification rate: 1
# Diagnostic plots for the training-set forest; the call also prints the
# importance table shown below.
plot(rfobj, plots.one.page = FALSE)


                          all    0   Yes
Community Trust             0   NA    NA
Get Along                   0   NA    NA
Community Shares Values     0   NA    NA
Helpful Community           0   NA    NA
Close-knit Community        0   NA    NA
Religious Importance        0   NA    NA
Religious Attendance        0   NA    NA
Togetherness                0   NA    NA
Feel Close                  0   NA    NA
Spend Time Together         0   NA    NA
Expression                  0   NA    NA
Family Pride                0   NA    NA
Loyalty                     0   NA    NA
Trust                       0   NA    NA
Successful Family           0   NA    NA
Similar Values              0   NA    NA
Family Respect              0   NA    NA
Helpful Friends             0   NA    NA
Close Friends               0   NA    NA
See Friends                 0   NA    NA
Helpful Family              0   NA    NA
Close Family                0   NA    NA
See Family                  0   NA    NA
EnglishDiff                 0   NA    NA
EnglishSpeak                0   NA    NA
Income_median               0   NA    NA
# Importance matrix of the training-set forest: one row per predictor,
# with an overall column ("all") and one column per outcome class.
rfobj$importance
                        all  0 Yes
Ethnicity                 0 NA  NA
Age                       0 NA  NA
Gender                    0 NA  NA
Religion                  0 NA  NA
Employment                0 NA  NA
Income_median             0 NA  NA
EnglishSpeak              0 NA  NA
EnglishDiff               0 NA  NA
See Family                0 NA  NA
Close Family              0 NA  NA
Helpful Family            0 NA  NA
See Friends               0 NA  NA
Close Friends             0 NA  NA
Helpful Friends           0 NA  NA
Family Respect            0 NA  NA
Similar Values            0 NA  NA
Successful Family         0 NA  NA
Trust                     0 NA  NA
Loyalty                   0 NA  NA
Family Pride              0 NA  NA
Expression                0 NA  NA
Spend Time Together       0 NA  NA
Feel Close                0 NA  NA
Togetherness              0 NA  NA
Religious Attendance      0 NA  NA
Religious Importance      0 NA  NA
Close-knit Community      0 NA  NA
Helpful Community         0 NA  NA
Community Shares Values   0 NA  NA
Get Along                 0 NA  NA
Community Trust           0 NA  NA
# Overall ("all" column) variable importance of the training-set forest.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart; reorder() sorts the bars by importance.
importance_plot <- ggplot(
  var_importance_df,
  aes(x = reorder(variable, importance), y = importance)
) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()

plot(importance_plot)

Test Set Importance

# Score the held-out rows with the training-set forest.
test_rf <- predict(rfobj, newdata = test)
# Report performance for the test-set predictions. Passing `rfobj` here
# would only repeat the training (OOB) metrics and leave `test_rf` unused.
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
        251           0         Inf           0         NaN           0 
       prec         npv    misclass       brier  brier.norm         auc 
          0         NaN           1           0           0         NaN 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
        NaN         NaN          NA          NA         NaN         NaN 
      gmean 
        NaN 

Folk Medicine

# Tabulate the outcome with the ps() helper (defined outside this section).
# The table printed below shows the count and percentage of each level;
# note that the observed levels are "0" and "Yes", plus missing values.
ps(`Folkmedicine`)
# A tibble: 3 × 3
  Folkmedicine     n   pct
  <fct>        <int> <dbl>
1 0             2189 83.9 
2 Yes            348 13.3 
3 <NA>            72  2.76

Random Forest (randomForestSRC)

# install.packages("randomForestSRC")

# Modelling frame for the folk-medicine outcome: keep the response and the
# predictors, drop rows with any missing value, and shorten the column
# names that contain spaces.
rfdata <- qol |>
  select(
    Folkmedicine, Ethnicity, Age, Gender, Religion, `Full Time Employment`,
    Income_median, `English Speaking`, `English Difficulties`,
    `See Family`:`Community Trust`
  ) |>
  na.omit() |>
  rename(
    Employment = `Full Time Employment`,
    EnglishSpeak = `English Speaking`,
    EnglishDiff = `English Difficulties`
  ) |>
  as.data.frame()

# Fit the imbalanced-data forest on all complete cases, requesting variable
# importance and the G-mean as the performance measure.
imb <- imbalanced(
  Folkmedicine ~ .,
  data = rfdata,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
print(imb)
                         Sample size: 2152
           Frequency of class labels: 1866, 286
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 271.6027
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 1360
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: 6.5245
                   (OOB) Brier score: 0.11195505
        (OOB) Normalized Brier score: 0.44782019
                           (OOB) AUC: 0.66685311
                        (OOB) PR-AUC: 0.21743813
                        (OOB) G-mean: 0.62098892
   (OOB) Requested performance error: 0.37901108

Confusion matrix:

          predicted
  observed    0 Yes class.error
       0   1029 837      0.4486
       Yes   86 200      0.3007

      (OOB) Misclassification rate: 0.4289033
# Diagnostic plots for the full-data forest; the call also prints the
# importance table shown below.
plot(imb, plots.one.page = FALSE)


                              all    0   Yes
Ethnicity                  0.0446   NA    NA
Age                        0.0441   NA    NA
EnglishSpeak               0.0130   NA    NA
Family Pride               0.0088   NA    NA
Employment                 0.0082   NA    NA
Helpful Friends            0.0080   NA    NA
Successful Family          0.0064   NA    NA
Expression                 0.0052   NA    NA
Feel Close                 0.0049   NA    NA
Community Trust            0.0040   NA    NA
Close Friends              0.0037   NA    NA
EnglishDiff                0.0037   NA    NA
Community Shares Values    0.0030   NA    NA
Togetherness               0.0027   NA    NA
Loyalty                    0.0025   NA    NA
Religious Importance       0.0024   NA    NA
Trust                      0.0023   NA    NA
See Friends                0.0023   NA    NA
Religion                   0.0021   NA    NA
Close Family               0.0021   NA    NA
Spend Time Together        0.0016   NA    NA
Get Along                  0.0009   NA    NA
Family Respect             0.0006   NA    NA
Close-knit Community       0.0002   NA    NA
Similar Values            -0.0003   NA    NA
Helpful Family            -0.0010   NA    NA
# Performance summary of the full-data forest (sensitivity, specificity,
# AUC, G-mean, ... as listed in the output below).
get.imbalanced.performance(imb)
  n.majority   n.minority       iratio    threshold         sens         spec 
1866.0000000  286.0000000    6.5244755    0.1328996    0.6993007    0.5514469 
        prec          npv     misclass        brier   brier.norm          auc 
   0.1928640    0.9228700    0.4289033    0.1119550    0.4478202    0.6668531 
          F1        F1mod  pr.auc.rand       pr.auc      F1gmean   F1modgmean 
   0.3023432    0.4205220    0.1328996    0.2174381    0.4616660    0.5207555 
       gmean 
   0.6209889 
# Overall ("all" column) variable importance of the full-data forest.
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart; reorder() sorts the bars by importance.
importance_plot <- ggplot(
  var_importance_df,
  aes(x = reorder(variable, importance), y = importance)
) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_minimal()

plot(importance_plot)

Training/Test set Variable Importance

Training Importance

# Stratified (roughly 70/30) train/test split of the folk-medicine data.
# The outcome's levels are "0" and "Yes" (see the ps() table and the
# confusion matrix of the full-data fit), so the negative class is selected
# as "not Yes". Filtering on "No" matches no rows, which leaves `train`
# with a single class and yields a degenerate forest.
pos <- rfdata |> filter(Folkmedicine == "Yes")
neg <- rfdata |> filter(Folkmedicine != "Yes")

# Fail fast instead of silently fitting a one-class forest.
stopifnot(nrow(pos) > 0, nrow(neg) > 0)

set.seed(222)
# Per-row indicator: 0 = training (probability 0.7), 1 = test (0.3).
ind_pos <- sample(c(0, 1), nrow(pos), replace = TRUE, prob = c(0.7, 0.3))
ind_neg <- sample(c(0, 1), nrow(neg), replace = TRUE, prob = c(0.7, 0.3))

train <- bind_rows(pos[ind_pos == 0, ], neg[ind_neg == 0, ])
test <- bind_rows(pos[ind_pos == 1, ], neg[ind_neg == 1, ])

# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
# Fit the imbalanced-data forest on the training rows only.
rfobj <- imbalanced(
  Folkmedicine ~ .,
  data = train,
  importance = TRUE,
  perf.type = "gmean",
  splitrule = "gini"
)
Warning in rfsrc(formula = Folkmedicine ~ ., data = structure(list(Folkmedicine = structure(c(2L, : empty classes found when implementing classification
# Summary of the training-set forest (sample size, class frequencies,
# OOB metrics and confusion matrix).
print(rfobj)
                         Sample size: 200
           Frequency of class labels: NA, 200
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 1
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 126
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: Inf
                   (OOB) Brier score: 0
        (OOB) Normalized Brier score: 0
                           (OOB) AUC: NaN
                        (OOB) PR-AUC: NA
                        (OOB) G-mean: NaN
   (OOB) Requested performance error: 1

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0     0   0         NaN
       Yes 200   0           1

      (OOB) Misclassification rate: 1
# NOTE(review): this prints the same object a second time and repeats the
# summary above -- confirm whether the duplicate is intentional.
print(rfobj)
                         Sample size: 200
           Frequency of class labels: NA, 200
                     Number of trees: 3000
           Forest terminal node size: 1
       Average no. of terminal nodes: 1
No. of variables tried at each split: 6
              Total no. of variables: 31
       Resampling used to grow trees: swor
    Resample size used to grow trees: 126
                            Analysis: RFQ
                              Family: class
                      Splitting rule: gini *random*
       Number of random split points: 10
                    Imbalanced ratio: Inf
                   (OOB) Brier score: 0
        (OOB) Normalized Brier score: 0
                           (OOB) AUC: NaN
                        (OOB) PR-AUC: NA
                        (OOB) G-mean: NaN
   (OOB) Requested performance error: 1

Confusion matrix:

          predicted
  observed   0 Yes class.error
       0     0   0         NaN
       Yes 200   0           1

      (OOB) Misclassification rate: 1
# Diagnostic plots for the training-set forest; the call also prints the
# importance table shown below.
plot(rfobj, plots.one.page = FALSE)


                          all    0   Yes
Community Trust             0   NA    NA
Get Along                   0   NA    NA
Community Shares Values     0   NA    NA
Helpful Community           0   NA    NA
Close-knit Community        0   NA    NA
Religious Importance        0   NA    NA
Religious Attendance        0   NA    NA
Togetherness                0   NA    NA
Feel Close                  0   NA    NA
Spend Time Together         0   NA    NA
Expression                  0   NA    NA
Family Pride                0   NA    NA
Loyalty                     0   NA    NA
Trust                       0   NA    NA
Successful Family           0   NA    NA
Similar Values              0   NA    NA
Family Respect              0   NA    NA
Helpful Friends             0   NA    NA
Close Friends               0   NA    NA
See Friends                 0   NA    NA
Helpful Family              0   NA    NA
Close Family                0   NA    NA
See Family                  0   NA    NA
EnglishDiff                 0   NA    NA
EnglishSpeak                0   NA    NA
Income_median               0   NA    NA
# Importance matrix of the training-set forest: one row per predictor,
# with an overall column ("all") and one column per outcome class.
rfobj$importance
                        all  0 Yes
Ethnicity                 0 NA  NA
Age                       0 NA  NA
Gender                    0 NA  NA
Religion                  0 NA  NA
Employment                0 NA  NA
Income_median             0 NA  NA
EnglishSpeak              0 NA  NA
EnglishDiff               0 NA  NA
See Family                0 NA  NA
Close Family              0 NA  NA
Helpful Family            0 NA  NA
See Friends               0 NA  NA
Close Friends             0 NA  NA
Helpful Friends           0 NA  NA
Family Respect            0 NA  NA
Similar Values            0 NA  NA
Successful Family         0 NA  NA
Trust                     0 NA  NA
Loyalty                   0 NA  NA
Family Pride              0 NA  NA
Expression                0 NA  NA
Spend Time Together       0 NA  NA
Feel Close                0 NA  NA
Togetherness              0 NA  NA
Religious Attendance      0 NA  NA
Religious Importance      0 NA  NA
Close-knit Community      0 NA  NA
Helpful Community         0 NA  NA
Community Shares Values   0 NA  NA
Get Along                 0 NA  NA
Community Trust           0 NA  NA
# Overall ("all" column) variable importance of the training-set forest.
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(
  variable = names(var_importance),
  importance = var_importance
)

# Horizontal bar chart; reorder() sorts the bars by importance.
importance_plot <- ggplot(
  var_importance_df,
  aes(x = reorder(variable, importance), y = importance)
) +
  geom_col(fill = "#F8766D") +
  coord_flip() +
  labs(title = "Variable Importance", x = "Variable", y = "Importance") +
  theme_bw()

plot(importance_plot)

Test Set Importance

# Score the held-out rows with the training-set forest.
test_rf <- predict(rfobj, newdata = test)
# Report performance for the test-set predictions. Passing `rfobj` here
# would only repeat the training (OOB) metrics and leave `test_rf` unused.
get.imbalanced.performance(test_rf)
 n.majority  n.minority      iratio   threshold        sens        spec 
        200           0         Inf           0         NaN           0 
       prec         npv    misclass       brier  brier.norm         auc 
          0         NaN           1           0           0         NaN 
         F1       F1mod pr.auc.rand      pr.auc     F1gmean  F1modgmean 
        NaN         NaN          NA          NA         NaN         NaN 
      gmean 
        NaN